import pandas as pd
import os
import logging
LOGGER = logging.getLogger(__name__)
from init import PATHS



def read_rdd_data():
    df_rdd = pd.read_csv(PATHS.dropbox / 'Data_outputted/D_Policy/RDD1995_2020.csv')
    df_rdd = df_rdd.rename(columns={'USA/DOE': 'USA'}) # can do a robustness using the USA/IEA column instead
    df_rdd['technology'] = df_rdd['technology'].replace('HGENCELL', 'fchydrogen')
    df_rdd['technology'] = df_rdd['technology'].replace('OTHERPANDS', 'otherstorage')
    df_rdd = pd.melt(df_rdd, id_vars=['year', 'technology'], var_name='country', value_name='rdd_public_spending')
    df_rdd = df_rdd.rename(columns={'country': 'Country', 'year': 'Year'})
    df_rdd = df_rdd.pivot(index=['Country', 'Year'], columns='technology', values='rdd_public_spending')
    df_rdd = df_rdd.reset_index()
    df_rdd['Country'] = df_rdd['Country'].apply(lambda x: x.capitalize()if x not in ['USA', 'UK'] else x)
    return df_rdd



def read_supplier_firmlevel_panel():
    """ Supplier panel, and adds categorical variable for whether the supplier is active that year, part of the ''old guard'' """
    dffirmyear = pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Panel_suppliers_FamInfo.csv')  # 1475
    unique_sids = {bvdid: idx for idx, bvdid in enumerate(dffirmyear['bvdid'].unique(), start=1)}
    dffirmyear['sid'] = dffirmyear['bvdid'].map(unique_sids)
    # Add a variable "Active" that flags whether the supplier is active in that year
    links = read_panel_of_links()
    # Keep only active links
    links = links[links['Active']]
    links = links[['Year', 'bvdid']].drop_duplicates().rename(columns={'Year': 'earliest_filing_year'})
    links['Active'] = 1
    dffirmyear = dffirmyear.merge(links, on=['bvdid', 'earliest_filing_year'], how='left')
    dffirmyear['Active'].fillna(0, inplace=True)
    # Old guard dummy; definition of old guard: if the bvdid had an active link before 2009
    oldguard_bvdids = dffirmyear[(dffirmyear['Active'] == 1) & (dffirmyear['earliest_filing_year'] < 2009)]['bvdid'].drop_duplicates().to_list()
    dffirmyear['oldguard'] = dffirmyear['bvdid'].apply(lambda x: 1 if x in oldguard_bvdids else 0)
    # ## add NAICS
    naics = pd.read_csv(PATHS.dropbox / 'Data_outputted/B_FactsetVariables/suppliers_naics.csv')
    naics = naics.rename(columns={'BvD ID number': 'bvdid'})
    naics['4digitNAICS'] = naics['NAICS, Primary code(s)'].apply(lambda x: str(x)[:4])
    naics = naics[['bvdid', '4digitNAICS']].drop_duplicates()
    naics = naics.groupby('bvdid').agg(lambda x: ','.join(x.unique())).reset_index()
    dffirmyear = dffirmyear.merge(naics, on='bvdid', how='left')
    dffirmyear = dffirmyear[['sid','earliest_filing_year','Active','4digitNAICS','oldguard','Count_Bat_excl','Count_FC_excl']]
    return dffirmyear


def get_oem_supplier_activelinks_with_supplierstocks(balanced=False):
    dfsupplieryear = pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Panel_suppliers_FamInfo.csv')
    unique_sids = {bvdid: idx for idx, bvdid in enumerate(dfsupplieryear['bvdid'].unique(), start=1)}
    dfsupplieryear['sid'] = dfsupplieryear['bvdid'].map(unique_sids)
    techs = ['_Bat', '_FC']
    patenting_variables = [f'Stock{s}' for s in techs]
    cols1 = ['bvdid', 'sid','earliest_filing_year'] + patenting_variables
    dfsupplieryear = dfsupplieryear[cols1]
    # Import Links
    links = read_panel_of_links()
    if balanced:
        maskBalanced = links['year_company_first_appears'] < 2009
        links = links[maskBalanced]
    cols2 = ['OEM_Level1_ID', 'bvdid', 'Year', 'Active', 'Years_since_firstactive']
    links = links[cols2]
    links = links[links['Active']]
    oemsupplierlinks_supplierstocks = links.merge(dfsupplieryear, left_on=['bvdid', 'Year'], right_on=['bvdid', 'earliest_filing_year'], how='inner')
    oemsupplierlinks_supplierstocks = oemsupplierlinks_supplierstocks.drop('bvdid', axis=1)
    return oemsupplierlinks_supplierstocks


def read_panel_of_links():
    links = pd.read_csv(PATHS.dropbox / 'Data_outputted/B_FactsetVariables/SC_ProdSuppPanel.csv')
    links = links.rename(columns={'SUPPLIER_ID': 'fctid', 'supplier_fctname': 'fctname'})
    supplierids = pd.read_csv(PATHS.dropbox / 'Data_outputted/B_FactsetVariables/suppliers_ids.csv')
    #  needed to link between the fctids of links and the bvdids of suppliers.
    links = links.merge(supplierids, on=['fctid', 'fctname'])
    return links


